1 ///////////////////////////////////////////////////////////////
2 // Aho-Corasick's algorithm, as explained in //
3 // http://dx.doi.org/10.1145/360825.360855 //
4 ///////////////////////////////////////////////////////////////
6 // Max number of states in the matching machine.
7 // Should be equal to the sum of the length of all keywords.
8 const int MAXS
= 6 * 50 + 10;
10 // Number of characters in the alphabet.
13 // Output for each state, as a bitwise mask.
14 // Bit i in this mask is on if the keyword with index i
15 // appears when the machine enters this state.
18 // Used internally in the algorithm.
19 int f
[MAXS
]; // Failure function
20 int g
[MAXS
][MAXC
]; // Goto function, or -1 if fail.
22 // Builds the string matching machine.
24 // words - Vector of keywords. The index of each keyword is
26 // "out[state] & (1 << i)" is > 0 if we just found
27 // word[i] in the text.
28 // lowestChar - The lowest char in the alphabet.
30 // highestChar - The highest char in the alphabet.
32 // "highestChar - lowestChar" must be <= MAXC,
33 // otherwise we will access the g matrix outside
34 // its bounds and things will go wrong.
36 // Returns the number of states that the new machine has.
37 // States are numbered 0 up to the return value - 1, inclusive.
38 int buildMatchingMachine(const vector
<string
> &words
,
39 char lowestChar
= 'a',
40 char highestChar
= 'z') {
41 memset(out
, 0, sizeof out
);
42 memset(f
, -1, sizeof f
);
43 memset(g
, -1, sizeof g
);
45 int states
= 1; // Initially, we just have the 0 state
47 for (int i
= 0; i
< words
.size(); ++i
) {
48 const string
&keyword
= words
[i
];
50 for (int j
= 0; j
< keyword
.size(); ++j
) {
51 int c
= keyword
[j
] - lowestChar
;
52 if (g
[currentState
][c
] == -1) {
53 // Allocate a new node
54 g
[currentState
][c
] = states
++;
56 currentState
= g
[currentState
][c
];
58 // There's a match of keywords[i] at node currentState.
59 out
[currentState
] |= (1 << i
);
62 // State 0 should have an outgoing edge for all characters.
63 for (int c
= 0; c
< MAXC
; ++c
) {
69 // Now, let's build the failure function
71 // Iterate over every possible input
72 for (int c
= 0; c
<= highestChar
- lowestChar
; ++c
) {
73 // All nodes s of depth 1 have f[s] = 0
74 if (g
[0][c
] != -1 and g
[0][c
] != 0) {
80 int state
= q
.front();
82 for (int c
= 0; c
<= highestChar
- lowestChar
; ++c
) {
83 if (g
[state
][c
] != -1) {
84 int failure
= f
[state
];
85 while (g
[failure
][c
] == -1) {
88 failure
= g
[failure
][c
];
89 f
[g
[state
][c
]] = failure
;
92 out
[g
[state
][c
]] |= out
[failure
];
101 // Finds the next state the machine will transition to.
103 // currentState - The current state of the machine. Must be
104 // between 0 and the number of states - 1,
106 // nextInput - The next character that enters into the machine.
107 // Should be between lowestChar and highestChar,
109 // lowestChar - Should be the same lowestChar that was passed
110 // to "buildMatchingMachine".
112 // Returns the next state the machine will transition to.
113 // This is an integer between 0 and the number of states - 1,
115 int findNextState(int currentState
, char nextInput
,
116 char lowestChar
= 'a') {
117 int answer
= currentState
;
118 int c
= nextInput
- lowestChar
;
119 while (g
[answer
][c
] == -1) answer
= f
[answer
];
124 // How to use this algorithm:
126 // 1. Modify the MAXS and MAXC constants as appropriate.
127 // 2. Call buildMatchingMachine with the set of keywords to
129 // 3. Start at state 0. Call findNextState to incrementally
130 // transition between states.
131 // 4. Check the out function to see if a keyword has been
136 // Assume keywords is a vector that contains
137 // {"he", "she", "hers", "his"} and text is a string that
138 // contains "ahishers".
140 // Consider this program:
142 // buildMatchingMachine(keywords, 'a', 'z');
143 // int currentState = 0;
144 // for (int i = 0; i < text.size(); ++i) {
145 // currentState = findNextState(currentState, text[i], 'a');
147 // Nothing new, let's move on to the next character.
148 // if (out[currentState] == 0) continue;
150 // for (int j = 0; j < keywords.size(); ++j) {
151 // if (out[currentState] & (1 << j)) {
152 // //Matched keywords[j]
153 // cout << "Keyword " << keywords[j]
154 // << " appears from "
155 // << i - keywords[j].size() + 1
156 // << " to " << i << endl;
161 // The output of this program is:
163 // Keyword his appears from 1 to 3
164 // Keyword he appears from 4 to 5
165 // Keyword she appears from 3 to 5
166 // Keyword hers appears from 4 to 7
168 ///////////////////////////////////////////////////////////////
169 // End of Aho-Corasick's algorithm //
170 ///////////////////////////////////////////////////////////////